#------------------------------------------------------------------------------
# combine and process ACS micro data sets
#==============================================================================

#------------------------------------------------------------------------------
# first load and combine household and individual level data
#==============================================================================

# Locate the raw ACS PUMS extracts (fst files) inside the raw-data directory
f.path <- file.path(rawdata_path, 'ACS_PUMS')
raw.file.names <- list.files(path = f.path, pattern = "fst")

# Split the listing by record type using the file-name tag:
# "_h" marks household files, "_p" marks individual (person) files.
# Due to data size, the raw data are split across separate files.
house.files <- grep("_h", raw.file.names, value = TRUE)
individual.files <- grep("_p", raw.file.names, value = TRUE)

# Load and subset useful household data
# Household variables to keep; lower-cased because every file's column names
# are lower-cased after loading
    keep_vars <- tolower(c('SERIALNO','PUMA','ST','WGTP','NP','TYPE'))
# Only keep states with full information during years 2005:2011
    states <- c(2, 8, 13, 21, 24, 25, 34, 35, 37, 40, 41, 44, 45, 49, 51, 55)
# Year of each household file, read from characters 6-9 of the file name.
# vapply() guarantees a numeric vector even when no file was found.
    tyears <- vapply(house.files, function(x) as.numeric(substr(x, 6, 9)), numeric(1))
# Pre-allocated list holding one subset table per file
    l.acs <- vector("list", length(house.files))

# Loop over the household files; seq_along() skips the loop entirely when the
# file list is empty (1:length() would iterate over c(1, 0))
    for(i in seq_along(house.files)){
    # load data
        df.temp.acs <- read.fst(file.path(f.path, house.files[i]), as.data.table = TRUE)
    # column-name case differs across files, so normalise to lower case
        setnames(df.temp.acs, tolower(names(df.temp.acs)))
    # this year might be missing some variables: keep only those present
        tvars <- keep_vars[keep_vars %in% names(df.temp.acs)]
    # remove unnecessary states and variables
        df.temp.acs <- df.temp.acs[as.numeric(df.temp.acs$st) %in% states, c(tvars), with = FALSE]
    # tag every row with the file's year (also safe for a zero-row subset)
        df.temp.acs[, Year := tyears[[i]]]
    # append to list
        l.acs[[i]] <- df.temp.acs
    # progress: share of files processed so far
        print(round(i/length(house.files), 2))
    }

# Bind the output. fill = TRUE pads variables that are absent in some years
# with NA; without it, files with differing column sets cannot be bound.
    microACS.hh <- rbindlist(l.acs, fill = TRUE)
    rm(l.acs); gc()

# Load and subset useful individual-level data
# Person variables to keep; lower-cased because every file's column names
# are lower-cased after loading
    keep_vars <- tolower(c('SERIALNO','SPORDER','PUMA','ST','PWGTP','AGEP','CIT','DDRS','DEAR','DEYE','DOUT','DPHY','MAR','SCHL','SEX','ESR','HISP','POVPIP',
        'RACWHT','RACBLK','RACAIAN','RACASN','RACNHPI','RACSOR','RAC1P'))
# Only keep states with full information during years 2005:2011
    states <- c(2, 8, 13, 21, 24, 25, 34, 35, 37, 40, 41, 44, 45, 49, 51, 55)
# Year of each individual file, read from characters 6-9 of its own file name.
# The years and the loop bounds are taken from individual.files (not from
# house.files) so that each file is always paired with its own year, even if
# the two file lists differ in length or order.
# NOTE(review): assumes individual file names carry the year in the same
# position as household file names -- confirm against the raw-data folder.
    tyears <- vapply(individual.files, function(x) as.numeric(substr(x, 6, 9)), numeric(1))
# Pre-allocated list holding one subset table per file
    l.acs <- vector("list", length(individual.files))

# Loop over the individual files; seq_along() skips the loop entirely when
# the file list is empty (1:length() would iterate over c(1, 0))
    for(i in seq_along(individual.files)){
    # load data
        df.temp.acs <- read.fst(file.path(f.path, individual.files[i]), as.data.table = TRUE)
    # column-name case differs across files, so normalise to lower case
        setnames(df.temp.acs, tolower(names(df.temp.acs)))
    # this year might be missing some variables: keep only those present
        tvars <- keep_vars[keep_vars %in% names(df.temp.acs)]
    # remove unnecessary states and variables
        df.temp.acs <- df.temp.acs[as.numeric(df.temp.acs$st) %in% states, c(tvars), with = FALSE]
    # tag every row with the file's year (also safe for a zero-row subset)
        df.temp.acs[, year := tyears[[i]]]
    # append to list
        l.acs[[i]] <- df.temp.acs
    # progress: share of files processed so far
        print(round(i/length(individual.files), 2))
    }

# Bind the output; fill = TRUE pads variables absent in some years with NA
    microACS.ind <- rbindlist(l.acs, fill = TRUE)
    rm(l.acs); gc()

#------------------------------------------------------------------------------
# clean and process data 
#==============================================================================
# Create new data.table based on household level variables.
# The column names must be wrapped in c() to form one selection vector
# (the previous unwrapped list of names was a syntax error); with = FALSE
# makes data.table treat the vector as column names.
    microACS.hh <- microACS.hh[, c("Year", "serialno", "wgtp", "np", "type"), with = FALSE]
# Give the household columns their analysis names
    setnames(microACS.hh,
        c("Year", "serialno", "wgtp", "np", "type"),
        c("Year", "HseNo", "HseWgt", "HseNP", "HseTyp"))

# Rename the individual-level identifiers, weight, geography, sex and age.
# "year" becomes "Year" so both tables share the merge keys Year and HseNo.
    setnames(microACS.ind,
        c("year", "serialno", "sporder", "pwgtp", "st", "puma", "sex","agep"),
        c("Year", "HseNo", "PerNo", "PerWgt", "State", "Puma", "Sex","Age"))

# create variables
# Age group in five contiguous bands:
#   0: under 15, 1: 15-24, 2: 25-44, 3: 45-64, 4: 65 and over.
# Each band starts exactly where the previous one ends, so ages 25 and 45 are
# classified (lower bounds of 26 and 46 would leave them as NA).
# Rows with a missing Age match no condition and keep AgeGrp4 = NA.
    microACS.ind[Age < 15, AgeGrp4 := 0]
    microACS.ind[15 <= Age & Age < 25, AgeGrp4 := 1]
    microACS.ind[25 <= Age & Age < 45, AgeGrp4 := 2]
    microACS.ind[45 <= Age & Age < 65, AgeGrp4 := 3]
    microACS.ind[65 <= Age, AgeGrp4 := 4]

# born in usa
# BornUSA is 1 when cit is 1 and 0 when cit is 2-5; any other cit value
# (including NA) leaves BornUSA as NA.
# NOTE(review): presumably cit follows the ACS citizenship coding -- confirm
# that code 1 is the only "born in the USA" category intended here.
    microACS.ind[cit %in% 2:5, BornUSA := 0]
    microACS.ind[cit %in% 1, BornUSA := 1]

# race : use detailed record
# First pass: collapse the detailed rac1p codes 1-8 into Race4
#   rac1p 1 -> 1, 2 -> 2, 3/4/5 -> 3, 6/7 -> 4, 8 -> 9.
# NOTE(review): the racwht/racblk/racaian/racasn/racnhpi/racsor flags used
# below suggest 1 = White, 2 = Black, 3 = American Indian/Alaska Native,
# 4 = Asian/Pacific Islander, 9 = some other race -- confirm with the codebook.
    microACS.ind[rac1p == 1, Race4 := 1]   
    microACS.ind[rac1p == 2, Race4 := 2]   
    microACS.ind[rac1p %in% c(3,4,5), Race4 := 3]   
    microACS.ind[rac1p %in% c(6,7), Race4 := 4]   
    microACS.ind[rac1p %in% c(8), Race4 := 9]   

# Second pass: rows with rac1p == 9 are assigned from the individual race
# flags. The statements run in order and each one overwrites the previous
# value, so when several flags are set the LAST matching line wins:
# racblk, then racwht, then racasn/racnhpi, then racaian, then racsor.
# Do not reorder these lines. Rows with rac1p == 9 and no flag equal to 1
# keep Race4 = NA.
    microACS.ind[rac1p == 9 & racsor == 1 ,Race4 := 9]
    microACS.ind[rac1p == 9 & racaian == 1, Race4 := 3] 
    microACS.ind[rac1p == 9 & (racasn == 1 | racnhpi == 1), Race4 := 4]  
    microACS.ind[rac1p == 9 & racwht == 1, Race4 := 1]   
    microACS.ind[rac1p == 9 & racblk == 1, Race4 := 2]  

# Hispanic indicator: 1 when hisp differs from '01', 0 when it equals '01',
# NA when hisp is missing
    microACS.ind[, Hisp := as.numeric(hisp != '01')]

# Marital status: straight copy of mar
    microACS.ind[, MarStat6 := mar]

# Unemployed indicator, set only on rows with a non-empty esr:
# 1 when esr is '3', otherwise 0
    microACS.ind[esr != '', UnEmpl := as.numeric(esr == '3')]

# PhysProb: names of the five difficulty flags it is built from
  vars <- c("ddrs", "dear", "deye", "dout", "dphy")

# Start everyone at 0, then switch to 1 for rows where at least one of the
# difficulty flags equals '1'
  microACS.ind[, PhysProb := 0][
    dphy == '1' | dout == '1' | deye == '1' | dear == '1' | ddrs == '1',
    PhysProb := 1]

# Merge household and individual-level data on (Year, HseNo).
# merge() defaults to an inner join, so only persons with a matching
# household record (and vice versa) are kept; household columns are repeated
# for each person of the household.
# NOTE(review): the original author states every record finds its match, so
# no rows should be lost here -- worth verifying with a row count.
  microACS_full <- merge(microACS.hh, microACS.ind, by = c("Year", "HseNo"))

# select only relevant variables
sel_vars <- c('Year','HseNo','HseWgt','HseNP','HseTyp','Puma','State','PerWgt','PerNo',
  'Age','Sex','AgeGrp4','MarStat6','BornUSA','Race4','Hisp','UnEmpl','PhysProb')

microACS_full <- microACS_full[, ..sel_vars]

# Reduce data size: store every selected column as integer, except the
# identifiers and weights (PerNo, HseNo, HseWgt, PerWgt), which are left as is.
# set() replaces each column by reference instead of copying the whole table
# on every assignment.
vars <- grep('PerNo|HseNo|HseWgt|PerWgt', sel_vars, invert = TRUE, value = TRUE)
for (var in vars){
    set(microACS_full, j = var, value = as.integer(microACS_full[[var]]))
}

# save files (maximum fst compression level)
write_fst(microACS_full, file.path(processed_path, "microACS_full_v1.fst"), compress = 100)

